In [1]:
# import sys; sys.path.append('.')
from setup import *
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)



In [2]:
print('Loading 200k tweets (including metadata) could take a minute or so...')
df = pd.read_csv(os.path.join(DATA_PATH, 'all_tweets.csv'), index_col='id', low_memory=False,
                 quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)
# in iPython Notebook print out df.columns to show that many of them contain dots
# rename the columns to be attribute-name friendly
df.columns = [label.replace('.', '_') for label in df.columns]
print('Done.')


Loading 200k tweets (including metadata) could take a minute or so...
Done.

In [3]:
# compress it using python 3 for future loading with python 3
df.to_csv(os.path.join(DATA_PATH, 'all_tweets.csv.gz'), quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC, compression='gzip')
# with gzip.open(os.path.join(DATA_PATH, 'all_tweets.csv.gz'), 'w') as fout:

While you wait you can browse this map of the metadata that comes with a tweet (a json string)


In [9]:
print('The raw table shape is {}'.format(df.shape))
nonnull_rows = 330
nonnull_cols = 50
df = df.dropna(axis=1, thresh=nonnull_rows)
print('After dropping columns with fewer than {} nonnull values, the table shape is {}'.format(nonnull_rows, df.shape))
df = df.dropna(axis=0, thresh=nonnull_cols)
print('After dropping rows with fewer than {} nonnull values, the table shape is {}'.format(nonnull_cols, df.shape))


# in ipython notebook, explore and describe the DataFrame columns
print('Of the {} columns, {} are actually DataFrames'.format(len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))
# remove dataframes with only 2 columns and one is the _str of the other:
for col in df.columns:
    if isinstance(df[col], pd.DataFrame):
        print('Column {} is a {}-wide DataFrame'.format(col, len(df[col].columns)))
        if df[col].columns[1] == df[col].columns[0] + '_str':
            print('Column {} looks easy because it has sub-columns {}'.format(col, df[col].columns))
            df[col] = df[col][df[col].columns[1]]
        else:
            try:
                assert(float(df[col].iloc[:, 0].max()) == float(df[col].iloc[:, 1].max()))
                df[col] = df[col].fillna(-1, inplace=False)
                series = pd.Series([int(Decimal(x)) for x in df[col].iloc[:, 1].values]).astype('int64').copy()
                del df[col]
                df[col] = series
                print('Finished converting column {} to type {}({})'.format(col, type(df[col]), df[col].dtype))
            except:
                print_exc()

print('Of the {} columns, {} are still DataFrames after trying to convert both columns to long integers'.format(
    len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))


The raw table shape is (200168, 285)
After dropping columns with fewer than 330 nonnull values, the table shape is (200168, 285)
After dropping rows with fewer than 50 nonnull values, the table shape is (193378, 285)
Of the 285 columns, 8 are actually DataFrames
Column quoted_status_id is a 2-wide DataFrame
Finished converting column quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column quoted_status_id_str is a 2-wide DataFrame
Finished converting column quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id_str is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Of the 281 columns, 0 are still DataFrames after trying to convert both columns to long integers

In [10]:
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)


df.describe() stats:
/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
favorite_count (int64)
count    193378.000000
mean          0.629679
std           6.251319
             ...      
50%           0.000000
75%           0.000000
max        1165.000000
Name: favorite_count, dtype: float64

id_str (int64)
count    1.933780e+05
mean     7.274888e+17
std      3.778481e+15
             ...     
50%      7.271970e+17
75%      7.301318e+17
max      7.345639e+17
Name: id_str, dtype: float64

in_reply_to_status_id (float64)
count    1.116500e+04
mean     7.270313e+17
std      1.485877e+16
             ...     
50%               NaN
75%               NaN
max      7.345608e+17
Name: in_reply_to_status_id, dtype: float64

in_reply_to_status_id_str (float64)
count    1.116500e+04
mean     7.270313e+17
std      1.485877e+16
             ...     
50%               NaN
75%               NaN
max      7.345608e+17
Name: in_reply_to_status_id_str, dtype: float64

in_reply_to_user_id (float64)
count    1.300700e+04
mean     2.245103e+16
std      1.249313e+17
             ...     
50%               NaN
75%               NaN
max      7.338248e+17
Name: in_reply_to_user_id, dtype: float64

in_reply_to_user_id_str (float64)
count    1.300700e+04
mean     2.245103e+16
std      1.249313e+17
             ...     
50%               NaN
75%               NaN
max      7.338248e+17
Name: in_reply_to_user_id_str, dtype: float64

lat (float64)
count    643.000000
mean      33.957111
std       16.306535
            ...    
50%             NaN
75%             NaN
max       59.800737
Name: lat, dtype: float64

lon (float64)
count    643.000000
mean     -57.314729
std       70.435494
            ...    
50%             NaN
75%             NaN
max      151.735644
Name: lon, dtype: float64

quoted_status_favorite_count (float64)
count      1698.000000
mean        361.769140
std        3222.551118
             ...      
50%                NaN
75%                NaN
max      109888.000000
Name: quoted_status_favorite_count, dtype: float64

quoted_status_retweet_count (float64)
count     1698.000000
mean       298.006478
std       2681.634140
             ...     
50%               NaN
75%               NaN
max      84527.000000
Name: quoted_status_retweet_count, dtype: float64

quoted_status_user_favourites_count (float64)
count      1698.000000
mean       7707.139576
std       26101.078684
             ...      
50%                NaN
75%                NaN
max      640291.000000
Name: quoted_status_user_favourites_count, dtype: float64

quoted_status_user_followers_count (float64)
count    1.698000e+03
mean     3.351078e+05
std      2.200700e+06
             ...     
50%               NaN
75%               NaN
max      5.992102e+07
Name: quoted_status_user_followers_count, dtype: float64

quoted_status_user_friends_count (float64)
count      1698.000000
mean       3878.302709
std       14818.430933
             ...      
50%                NaN
75%                NaN
max      181261.000000
Name: quoted_status_user_friends_count, dtype: float64

quoted_status_user_id (float64)
count    1.698000e+03
mean     2.315020e+16
std      1.265821e+17
             ...     
50%               NaN
75%               NaN
max      7.322411e+17
Name: quoted_status_user_id, dtype: float64

quoted_status_user_id_str (float64)
count    1.698000e+03
mean     2.315020e+16
std      1.265821e+17
             ...     
50%               NaN
75%               NaN
max      7.322411e+17
Name: quoted_status_user_id_str, dtype: float64

quoted_status_user_listed_count (float64)
count      1698.000000
mean       3456.337456
std       17113.085283
             ...      
50%                NaN
75%                NaN
max      173929.000000
Name: quoted_status_user_listed_count, dtype: float64

quoted_status_user_statuses_count (float64)
count      1698.000000
mean      26693.128386
std       49781.314248
             ...      
50%                NaN
75%                NaN
max      354746.000000
Name: quoted_status_user_statuses_count, dtype: float64

quoted_status_user_utc_offset (float64)
count     1406.000000
mean     -8585.206259
std      16993.135149
             ...     
50%               NaN
75%               NaN
max      43200.000000
Name: quoted_status_user_utc_offset, dtype: float64

retweet_count (int64)
count    193378.000000
mean         53.567846
std         877.497404
             ...      
50%           0.000000
75%           4.000000
max      166648.000000
Name: retweet_count, dtype: float64

retweeted_status_favorite_count (float64)
count     69423.000000
mean        156.261441
std        1597.516635
             ...      
50%                NaN
75%                NaN
max      215360.000000
Name: retweeted_status_favorite_count, dtype: float64

retweeted_status_id (float64)
count    6.942300e+04
mean     7.246058e+17
std      2.900791e+16
             ...     
50%               NaN
75%               NaN
max      7.345625e+17
Name: retweeted_status_id, dtype: float64

retweeted_status_id_str (float64)
count    6.942300e+04
mean     7.246058e+17
std      2.900791e+16
             ...     
50%               NaN
75%               NaN
max      7.345625e+17
Name: retweeted_status_id_str, dtype: float64

retweeted_status_in_reply_to_status_id (float64)
count    2.295000e+03
mean     7.242244e+17
std      2.985485e+16
             ...     
50%               NaN
75%               NaN
max      7.345272e+17
Name: retweeted_status_in_reply_to_status_id, dtype: float64

retweeted_status_in_reply_to_status_id_str (float64)
count    2.295000e+03
mean     7.242244e+17
std      2.985485e+16
             ...     
50%               NaN
75%               NaN
max      7.345272e+17
Name: retweeted_status_in_reply_to_status_id_str, dtype: float64

retweeted_status_in_reply_to_user_id (float64)
count    2.802000e+03
mean     1.101349e+16
std      8.824095e+16
             ...     
50%               NaN
75%               NaN
max      7.273329e+17
Name: retweeted_status_in_reply_to_user_id, dtype: float64

retweeted_status_in_reply_to_user_id_str (float64)
count    2.802000e+03
mean     1.101349e+16
std      8.824095e+16
             ...     
50%               NaN
75%               NaN
max      7.273329e+17
Name: retweeted_status_in_reply_to_user_id_str, dtype: float64

retweeted_status_quoted_status_favorite_count (float64)
count      2162.000000
mean        294.716004
std        6840.594249
             ...      
50%                NaN
75%                NaN
max      311618.000000
Name: retweeted_status_quoted_status_favorite_count, dtype: float64

retweeted_status_quoted_status_retweet_count (float64)
count      2162.000000
mean        319.838575
std        8842.632116
             ...      
50%                NaN
75%                NaN
max      406556.000000
Name: retweeted_status_quoted_status_retweet_count, dtype: float64

retweeted_status_quoted_status_user_favourites_count (float64)
count      1071.000000
mean       4255.130719
std       13101.264209
             ...      
50%                NaN
75%                NaN
max      269482.000000
Name: retweeted_status_quoted_status_user_favourites_count, dtype: float64

retweeted_status_quoted_status_user_followers_count (float64)
count    1.071000e+03
mean     6.740821e+05
std      1.972163e+06
             ...     
50%               NaN
75%               NaN
max      6.762776e+06
Name: retweeted_status_quoted_status_user_followers_count, dtype: float64

retweeted_status_quoted_status_user_friends_count (float64)
count     1071.000000
mean      3565.981326
std      13356.713457
             ...     
50%               NaN
75%               NaN
max      96189.000000
Name: retweeted_status_quoted_status_user_friends_count, dtype: float64

retweeted_status_quoted_status_user_id (float64)
count    1.071000e+03
mean     4.383884e+16
std      1.711556e+17
             ...     
50%               NaN
75%               NaN
max      7.287328e+17
Name: retweeted_status_quoted_status_user_id, dtype: float64

retweeted_status_quoted_status_user_id_str (float64)
count    1.071000e+03
mean     4.383884e+16
std      1.711556e+17
             ...     
50%               NaN
75%               NaN
max      7.287328e+17
Name: retweeted_status_quoted_status_user_id_str, dtype: float64

retweeted_status_quoted_status_user_listed_count (float64)
count     1071.000000
mean      1733.955182
std       3274.306542
             ...     
50%               NaN
75%               NaN
max      21608.000000
Name: retweeted_status_quoted_status_user_listed_count, dtype: float64

retweeted_status_quoted_status_user_statuses_count (float64)
count      1071.000000
mean      43782.852474
std       81090.258295
             ...      
50%                NaN
75%                NaN
max      354746.000000
Name: retweeted_status_quoted_status_user_statuses_count, dtype: float64

retweeted_status_quoted_status_user_utc_offset (float64)
count      969.000000
mean     -5814.241486
std      17857.526235
             ...     
50%               NaN
75%               NaN
max      43200.000000
Name: retweeted_status_quoted_status_user_utc_offset, dtype: float64

retweeted_status_retweet_count (float64)
count     69423.000000
mean        148.140818
std        1459.734542
             ...      
50%                NaN
75%                NaN
max      166648.000000
Name: retweeted_status_retweet_count, dtype: float64

retweeted_status_user_favourites_count (float64)
count     69423.000000
mean       7552.074874
std       22786.294974
             ...      
50%                NaN
75%                NaN
max      424498.000000
Name: retweeted_status_user_favourites_count, dtype: float64

retweeted_status_user_followers_count (float64)
count    6.942300e+04
mean     1.420601e+05
std      8.697409e+05
             ...     
50%               NaN
75%               NaN
max      3.850754e+07
Name: retweeted_status_user_followers_count, dtype: float64

retweeted_status_user_friends_count (float64)
count    6.942300e+04
mean     7.067446e+03
std      3.476161e+04
             ...     
50%               NaN
75%               NaN
max      4.717316e+06
Name: retweeted_status_user_friends_count, dtype: float64

retweeted_status_user_id (float64)
count    6.942300e+04
mean     3.075018e+16
std      1.450819e+17
             ...     
50%               NaN
75%               NaN
max      7.340921e+17
Name: retweeted_status_user_id, dtype: float64

retweeted_status_user_id_str (float64)
count    6.942300e+04
mean     3.075018e+16
std      1.450819e+17
             ...     
50%               NaN
75%               NaN
max      7.340921e+17
Name: retweeted_status_user_id_str, dtype: float64

retweeted_status_user_listed_count (float64)
count     69423.000000
mean       1311.707316
std        5366.950887
             ...      
50%                NaN
75%                NaN
max      173930.000000
Name: retweeted_status_user_listed_count, dtype: float64

retweeted_status_user_statuses_count (float64)
count    6.942300e+04
mean     2.441676e+04
std      5.399568e+04
             ...     
50%               NaN
75%               NaN
max      1.277548e+06
Name: retweeted_status_user_statuses_count, dtype: float64

retweeted_status_user_utc_offset (float64)
count    53755.000000
mean     -8987.895080
std      15669.797442
             ...     
50%               NaN
75%               NaN
max      46800.000000
Name: retweeted_status_user_utc_offset, dtype: float64

user_favourites_count (int64)
count    193378.000000
mean       2981.352750
std       12739.631358
             ...      
50%          73.000000
75%        1020.750000
max      673894.000000
Name: user_favourites_count, dtype: float64

user_followers_count (int64)
count    1.933780e+05
mean     3.443857e+03
std      5.943546e+04
             ...     
50%      4.480000e+02
75%      1.141000e+03
max      1.038394e+07
Name: user_followers_count, dtype: float64

user_friends_count (int64)
count    193378.000000
mean       1428.301570
std        5848.678639
             ...      
50%         343.000000
75%        1090.000000
max      382464.000000
Name: user_friends_count, dtype: float64

user_id (int64)
count    1.933780e+05
mean     5.724701e+16
std      1.942174e+17
             ...     
50%      1.492945e+09
75%      3.333417e+09
max      7.342205e+17
Name: user_id, dtype: float64

user_id_str (int64)
count    1.933780e+05
mean     5.724701e+16
std      1.942174e+17
             ...     
50%      1.492945e+09
75%      3.333417e+09
max      7.342205e+17
Name: user_id_str, dtype: float64

user_listed_count (int64)
count    193378.000000
mean        353.925746
std        1126.620779
             ...      
50%          78.000000
75%         248.000000
max      129229.000000
Name: user_listed_count, dtype: float64

user_statuses_count (int64)
count    1.933780e+05
mean     6.134178e+04
std      1.382712e+05
             ...     
50%      1.113600e+04
75%      5.799700e+04
max      2.537204e+06
Name: user_statuses_count, dtype: float64

user_utc_offset (float64)
count    119043.000000
mean      -6377.122552
std       18027.953290
             ...      
50%                NaN
75%                NaN
max       46800.000000
Name: user_utc_offset, dtype: float64

quoted_status_id (float64)
count    0.0
mean     NaN
std      NaN
        ... 
50%      NaN
75%      NaN
max      NaN
Name: quoted_status_id, dtype: float64

quoted_status_id_str (float64)
count    0.0
mean     NaN
std      NaN
        ... 
50%      NaN
75%      NaN
max      NaN
Name: quoted_status_id_str, dtype: float64

retweeted_status_quoted_status_id (float64)
count    0.0
mean     NaN
std      NaN
        ... 
50%      NaN
75%      NaN
max      NaN
Name: retweeted_status_quoted_status_id, dtype: float64

retweeted_status_quoted_status_id_str (float64)
count    0.0
mean     NaN
std      NaN
        ... 
50%      NaN
75%      NaN
max      NaN
Name: retweeted_status_quoted_status_id_str, dtype: float64

In [13]:
df


Out[13]:
coordinates_coordinates coordinates_type created_at entities_hashtags entities_media entities_symbols entities_urls entities_user_mentions favorite_count favorited geo_coordinates geo_type id_str in_reply_to_screen_name in_reply_to_status_id in_reply_to_status_id_str in_reply_to_user_id in_reply_to_user_id_str is_quote_status lang lat lon metadata_iso_language_code metadata_result_type place_bounding_box_coordinates place_bounding_box_type place_contained_within place_country place_country_code place_full_name place_id place_name place_place_type place_url possibly_sensitive quoted_status_created_at quoted_status_entities_hashtags quoted_status_entities_media quoted_status_entities_symbols quoted_status_entities_urls quoted_status_entities_user_mentions quoted_status_favorite_count quoted_status_favorited quoted_status_is_quote_status quoted_status_lang quoted_status_metadata_iso_language_code quoted_status_metadata_result_type quoted_status_possibly_sensitive quoted_status_retweet_count quoted_status_retweeted quoted_status_source quoted_status_text quoted_status_truncated quoted_status_user_contributors_enabled quoted_status_user_created_at quoted_status_user_default_profile quoted_status_user_default_profile_image quoted_status_user_description quoted_status_user_entities_description_urls quoted_status_user_entities_url_urls quoted_status_user_favourites_count quoted_status_user_followers_count quoted_status_user_friends_count quoted_status_user_geo_enabled quoted_status_user_has_extended_profile quoted_status_user_id quoted_status_user_id_str quoted_status_user_is_translation_enabled quoted_status_user_is_translator quoted_status_user_lang quoted_status_user_listed_count quoted_status_user_location quoted_status_user_name quoted_status_user_profile_background_color quoted_status_user_profile_background_image_url quoted_status_user_profile_background_image_url_https quoted_status_user_profile_background_tile quoted_status_user_profile_banner_url quoted_status_user_profile_image_url quoted_status_user_profile_image_url_https quoted_status_user_profile_link_color quoted_status_user_profile_sidebar_border_color quoted_status_user_profile_sidebar_fill_color quoted_status_user_profile_text_color quoted_status_user_profile_use_background_image quoted_status_user_protected quoted_status_user_screen_name quoted_status_user_statuses_count quoted_status_user_time_zone quoted_status_user_url quoted_status_user_utc_offset quoted_status_user_verified retweet_count retweeted retweeted_status_created_at retweeted_status_entities_hashtags retweeted_status_entities_media retweeted_status_entities_symbols retweeted_status_entities_urls retweeted_status_entities_user_mentions ... retweeted_status_quoted_status_user_profile_sidebar_fill_color retweeted_status_quoted_status_user_profile_text_color retweeted_status_quoted_status_user_profile_use_background_image retweeted_status_quoted_status_user_protected retweeted_status_quoted_status_user_screen_name retweeted_status_quoted_status_user_statuses_count retweeted_status_quoted_status_user_time_zone retweeted_status_quoted_status_user_url retweeted_status_quoted_status_user_utc_offset retweeted_status_quoted_status_user_verified retweeted_status_retweet_count retweeted_status_retweeted retweeted_status_source retweeted_status_text retweeted_status_truncated retweeted_status_user_contributors_enabled retweeted_status_user_created_at retweeted_status_user_default_profile retweeted_status_user_default_profile_image retweeted_status_user_description retweeted_status_user_entities_description_urls retweeted_status_user_entities_url_urls retweeted_status_user_favourites_count retweeted_status_user_followers_count retweeted_status_user_friends_count retweeted_status_user_geo_enabled retweeted_status_user_has_extended_profile retweeted_status_user_id retweeted_status_user_id_str retweeted_status_user_is_translation_enabled retweeted_status_user_is_translator retweeted_status_user_lang retweeted_status_user_listed_count retweeted_status_user_location retweeted_status_user_name retweeted_status_user_profile_background_color retweeted_status_user_profile_background_image_url retweeted_status_user_profile_background_image_url_https retweeted_status_user_profile_background_tile retweeted_status_user_profile_banner_url retweeted_status_user_profile_image_url retweeted_status_user_profile_image_url_https retweeted_status_user_profile_link_color retweeted_status_user_profile_sidebar_border_color retweeted_status_user_profile_sidebar_fill_color retweeted_status_user_profile_text_color retweeted_status_user_profile_use_background_image retweeted_status_user_protected retweeted_status_user_screen_name retweeted_status_user_statuses_count retweeted_status_user_time_zone retweeted_status_user_url retweeted_status_user_utc_offset retweeted_status_user_verified source text truncated user_contributors_enabled user_created_at user_default_profile user_default_profile_image user_description user_entities_description_urls user_entities_url_urls user_favourites_count user_followers_count user_friends_count user_geo_enabled user_has_extended_profile user_id user_id_str user_is_translation_enabled user_is_translator user_lang user_listed_count user_location user_name user_profile_background_color user_profile_background_image_url user_profile_background_image_url_https user_profile_background_tile user_profile_banner_url user_profile_image_url user_profile_image_url_https user_profile_link_color user_profile_sidebar_border_color user_profile_sidebar_fill_color user_profile_text_color user_profile_use_background_image user_protected user_screen_name user_statuses_count user_time_zone user_url user_utc_offset user_verified quoted_status_id quoted_status_id_str retweeted_status_quoted_status_id retweeted_status_quoted_status_id_str
id
731122251278499841 NaN NaN Fri May 13 14:01:42 +0000 2016 [{u'indices': [47, 52], u'text': u'Java'}, {u'... [{u'source_user_id': 150820027, u'source_statu... [] [{u'url': u'https://t.co/SVgMAwNxxj', u'indice... [{u'indices': [3, 17], u'id_str': u'150820027'... 0 False NaN NaN 731122251278499841 NaN NaN NaN NaN NaN False en NaN NaN en recent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 14 False Fri May 13 13:30:47 +0000 2016 [{u'indices': [28, 33], u'text': u'Java'}, {u'... [{u'expanded_url': u'http://twitter.com/javaco... [] [{u'url': u'https://t.co/SVgMAwNxxj', u'indice... [] ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 14.0 False <a href="http://bufferapp.com" rel="nofollow">... Top Performance Metrics for #Java, .NET, #PHP,... False False Tue Jun 01 22:38:53 +0000 2010 False False Java developers resource center. JCGs is one o... [] [{u'url': u'http://t.co/DivczES801', u'indices... 0.0 90268.0 130.0 False False 150820027.0 150820027.0 False False en 1717.0 NaN Java Code Geeks ACDED6 http://abs.twimg.com/images/themes/theme18/bg.gif https://abs.twimg.com/images/themes/theme18/bg... False https://pbs.twimg.com/profile_banners/15082002... http://pbs.twimg.com/profile_images/2928906892... https://pbs.twimg.com/profile_images/292890689... 038543 EEEEEE F6F6F6 333333 True False javacodegeeks 37567.0 Athens http://t.co/DivczES801 10800.0 False <a href="http://twitter.com" rel="nofollow">Tw... RT @javacodegeeks: Top Performance Metrics for... False False Wed Aug 12 15:20:38 +0000 2009 False False Husband, Father, Programmer, Gamer, Graphic De... [] NaN 845 221 709 False False 65061698 65061698 False False en 8 NaN Greg Herhuth 000000 http://abs.twimg.com/images/themes/theme9/bg.gif https://abs.twimg.com/images/themes/theme9/bg.gif False https://pbs.twimg.com/profile_banners/65061698... http://pbs.twimg.com/profile_images/7228456300... https://pbs.twimg.com/profile_images/722845630... 3B94D9 000000 000000 000000 False False zamajam 579 Eastern Time (US & Canada) NaN -14400.0 False NaN NaN NaN NaN
724281574129180672 NaN NaN Sun Apr 24 16:59:18 +0000 2016 [] NaN [] [{u'url': u'https://t.co/HshSAeTMYc', u'indice... [] 0 False NaN NaN 724281574129180672 NaN NaN NaN NaN NaN False en NaN NaN en recent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 False NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN <a href="http://twitterfeed.com" rel="nofollow... World's Largest Python Discovered in Nepal: WA... False False Tue Mar 24 14:13:53 +0000 2015 True False NaN [] [{u'url': u'http://t.co/mkBfH8QmsX', u'indices... 0 776 1910 True False 3110463964 3110463964 False False en 4 Lokoja, Kogi State, Nigeria. Ukpe Thompson C0DEED http://abs.twimg.com/images/themes/theme1/bg.png https://abs.twimg.com/images/themes/theme1/bg.png False https://pbs.twimg.com/profile_banners/31104639... http://pbs.twimg.com/profile_images/5852217706... https://pbs.twimg.com/profile_images/585221770... 0084B4 C0DEED DDEEF6 333333 True False newsymag 2159 Pacific Time (US & Canada) http://t.co/mkBfH8QmsX -25200.0 False NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
724275578879111169 NaN NaN Sun Apr 24 16:35:29 +0000 2016 [] NaN [] [{u'indices': [26, 49], u'url': u'https://t.co... [] 0 False NaN NaN 724275578879111169 NaN NaN NaN NaN NaN False en NaN NaN en recent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 False NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN <a href="https://path.com/" rel="nofollow">Pat... Watching Boa vs. Python — https://t.co/5THbrirfQO False False Wed Oct 05 01:11:53 +0000 2011 False False | Vocal @MEMORIES_MTL | Di Doan Ibu, ku Dengar... [] [{u'indices': [0, 22], u'url': u'http://t.co/j... 105 819 275 True False 385181009 385181009 False False id 1 PLBNG - MGL ﺳﻮﺭﻳﺎ 020305 http://pbs.twimg.com/profile_background_images... https://pbs.twimg.com/profile_background_image... True https://pbs.twimg.com/profile_banners/38518100... http://pbs.twimg.com/profile_images/7056528218... https://pbs.twimg.com/profile_images/705652821... 2FC2EF 000000 252429 666666 True False bismillah____ 59510 Bangkok http://t.co/jgsHtjOt6x 25200.0 False NaN NaN NaN NaN
724275568871673857 NaN NaN Sun Apr 24 16:35:26 +0000 2016 [] NaN [] [{u'indices': [115, 138], u'url': u'https://t.... [] 0 False NaN NaN 724275568871673857 NaN NaN NaN NaN NaN False ru NaN NaN ru recent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 False NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN <a href="http://vk.com" rel="nofollow">vk.com ... Чертова дюжина вакансий в IT и Digital / / 1.... False False Sun May 22 03:29:30 +0000 2011 False False NaN [] NaN 1 61 15 False False 302987528 302987528 False False ru 4 Rus Alex Birgazov C0DEED http://pbs.twimg.com/profile_background_images... https://pbs.twimg.com/profile_background_image... False https://pbs.twimg.com/profile_banners/30298752... http://pbs.twimg.com/profile_images/1364034429... https://pbs.twimg.com/profile_images/136403442... 0084B4 FFFFFF DDEEF6 333333 True False weelman93 124 Irkutsk NaN 28800.0 False NaN NaN NaN NaN

193378 rows × 281 columns


In [18]:
stats = df.describe()
columns = [c for c in stats.columns if stats[c]['count'] > 10000 or 'fav' in c or 'retweet' in c or df[c].dtype in (int, float, np.float64)] + ['text', 'favorite_count', 'geo_coordinates']
print(df.shape)
print(df[columns].shape)
for c in columns:
    print(c)
df.text


/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
(193378, 281)
(193378, 60)
favorite_count
id_str
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
lat
lon
quoted_status_favorite_count
quoted_status_retweet_count
quoted_status_user_favourites_count
quoted_status_user_followers_count
quoted_status_user_friends_count
quoted_status_user_id
quoted_status_user_id_str
quoted_status_user_listed_count
quoted_status_user_statuses_count
quoted_status_user_utc_offset
retweet_count
retweeted_status_favorite_count
retweeted_status_id
retweeted_status_id_str
retweeted_status_in_reply_to_status_id
retweeted_status_in_reply_to_status_id_str
retweeted_status_in_reply_to_user_id
retweeted_status_in_reply_to_user_id_str
retweeted_status_quoted_status_favorite_count
retweeted_status_quoted_status_retweet_count
retweeted_status_quoted_status_user_favourites_count
retweeted_status_quoted_status_user_followers_count
retweeted_status_quoted_status_user_friends_count
retweeted_status_quoted_status_user_id
retweeted_status_quoted_status_user_id_str
retweeted_status_quoted_status_user_listed_count
retweeted_status_quoted_status_user_statuses_count
retweeted_status_quoted_status_user_utc_offset
retweeted_status_retweet_count
retweeted_status_user_favourites_count
retweeted_status_user_followers_count
retweeted_status_user_friends_count
retweeted_status_user_id
retweeted_status_user_id_str
retweeted_status_user_listed_count
retweeted_status_user_statuses_count
retweeted_status_user_utc_offset
user_favourites_count
user_followers_count
user_friends_count
user_id
user_id_str
user_listed_count
user_statuses_count
user_utc_offset
quoted_status_id
quoted_status_id_str
retweeted_status_quoted_status_id
retweeted_status_quoted_status_id_str
text
favorite_count
geo_coordinates
Out[18]:
id
731122251278499841    RT @javacodegeeks: Top Performance Metrics for...
724281574129180672    World's Largest Python Discovered in Nepal: WA...
                                            ...                        
724275578879111169    Watching Boa vs. Python — https://t.co/5THbrirfQO
724275568871673857    Чертова дюжина вакансий в IT и Digital /  / 1....
Name: text, dtype: object

In [19]:
df[columns].to_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)

In [20]:
rawlen = len(df)
df.drop_duplicates('id_str', keep='last', inplace=True)
rawlen - len(df)


Out[20]:
10308

In [21]:
df[columns].to_csv(os.path.join(DATA_PATH, 'deduped_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)

In [ ]: